Last updated: 2015-09-23

Code version: 89227f632516ff3ca6a451e84fde3fedee9c7b8a

Comparing the conversion of reads to molecules for each cell. Used three different metrics:

Input

library("dplyr")
library("ggplot2")
theme_set(theme_bw(base_size = 16))
library("edgeR")
source("functions.R")
library("tidyr")

Input annotation.

anno <- read.table("../data/annotation.txt", header = TRUE,
                   stringsAsFactors = FALSE)
head(anno)
  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06

Input read counts.

reads <- read.table("../data/reads.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input molecule counts.

molecules <- read.table("../data/molecules.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input list of quality single cells.

quality_single_cells <- scan("../data/quality-single-cells.txt",
                             what = "character")

Keep only the single cells that passed the QC filters.

reads <- reads[, colnames(reads) %in% quality_single_cells]
molecules <- molecules[, colnames(molecules) %in% quality_single_cells]
anno <- anno[anno$sample_id %in% quality_single_cells, ]
stopifnot(dim(reads) == dim(molecules),
          nrow(anno) == ncol(reads))

Distribution of fold change to mean

Look at the distribution of fold change to mean. As being reported by others, the lowly expressed genes show divergent read and molecule counts

## calculate mean
reads_mean     <- apply(reads, 1, mean)
molecules_mean <- apply(molecules, 1, mean)
distribution <- data.frame(reads_mean, molecules_mean)

## calculate fold change to mean
distribution$fold_change_read     <- log2(reads_mean/mean(reads_mean))
distribution$fold_change_molecule <- log2(molecules_mean/mean(molecules_mean))

## select ERCC
distribution$ERCC <- grepl("ERCC", rownames(distribution))

## color palette
cbPalette <- c("#999999", "#0000FF", "#990033", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#009E73")

ggplot(distribution, aes(x = fold_change_molecule, y = fold_change_read, col = ERCC)) + geom_point(size = 3, alpha = 0.5) + scale_colour_manual(values=cbPalette) + stat_function(fun= function(x) {x}, col= "#56B4E9") + labs(x = "log2 fold change to mean (molecule)", y =  "log2 fold change to mean (reads)")

Transformation

In addition to comparing the raw counts of reads and molecules, we compare the log2 counts and the log counts per million.

For the log counts, I add a pseudocount of 1.

reads_log <- log2(reads + 1)
molecules_log <- log2(molecules + 1)

standardized by cmp and log-transform

reads_cpm <- cpm(reads, log = FALSE)
molecules_cpm <- cpm(molecules, log = FALSE)

Calculate cpm for the reads data using TMM-normalization.

norm_factors_reads <- calcNormFactors(reads, method = "TMM")
reads_tmm <- cpm(reads, lib.size = colSums(reads) * norm_factors_reads,
                 log = TRUE)

And for the molecules.

norm_factors_mol <- calcNormFactors(molecules, method = "TMM")
molecules_tmm <- cpm(molecules, lib.size = colSums(molecules) * norm_factors_mol,
                     log = TRUE)

conversion in each single cell

Counts

Compare the counts. (1) all genes. (2) only ERCC. (3) without ERCC, only endogenous genes

All genes

## linear regression per cell and make a table with intercept, slope, and r-squared
regression_table <- as.data.frame(do.call(rbind,lapply(names(reads),function(x){
  fit.temp <- lm(molecules[,x]~reads[,x])
    c(x,fit.temp$coefficients,summary(fit.temp)$adj.r.squared)
})))
names(regression_table) <- c("sample_id","Intercept","slope","r2")
regression_table$Intercept <- as.numeric(as.character(regression_table$Intercept))
regression_table$slope <- as.numeric(as.character(regression_table$slope))
regression_table$r2 <- as.numeric(as.character(regression_table$r2))
plot(regression_table$r2)

anno_regression <- merge(anno,regression_table,by="sample_id")

ggplot(anno_regression,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion (counts)")

ggplot(anno_regression,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion (counts)") + facet_grid(individual ~ batch)

## plot all the lines
anno_regression$reads_mean <- apply(reads, 2, mean)
anno_regression$molecules_mean <- apply(molecules, 2, mean)

ggplot(anno_regression, aes(x= reads_mean, y= molecules_mean)) + geom_point() + geom_abline(aes(intercept=Intercept, slope=slope, col=as.factor(individual), alpha= 0.5), data=anno_regression) + facet_grid(individual ~ batch)

Pairwise distance in (intercept, slope) between batches or individuals.

  • Compute pairwise Euclidean distance between cells.
anno_regression_dist <- as.matrix( dist(anno_regression[ , c("Intercept", "slope")]) )
rownames(anno_regression_dist) <- with(anno_regression,
                                       paste(individual, batch, sep = "_"))
colnames(anno_regression_dist) <- rownames(anno_regression_dist)
anno_regression_dist[1:2, 1:2]
          19098_1   19098_1
19098_1 0.0000000 0.5022502
19098_1 0.5022502 0.0000000
  • Cell-to-cell Euclidean distance within each individiaual, batch
## All cells
dist_mean <- data.frame(dist = rowSums(anno_regression_dist)/(ncol(anno_regression_dist) - 1))
ggplot(dist_mean, aes(y = dist, x = 1)) + geom_violin(alpha = .5) + 
  geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance in (intercept, slope)", x = "") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

## Within batch 
batch_unique <- with(anno_regression, paste(individual, batch, sep = "_"))
dist_vec <- lapply(1:length(unique(batch_unique)), function(per_batch) {
  dist_foo <- anno_regression_dist[batch_unique == unique(batch_unique)[per_batch],
                             batch_unique == unique(batch_unique)[per_batch] ]
  data.frame(dist = c(rowSums(dist_foo)/(ncol(dist_foo) - 1) ),
             batch_unique = rep(unique(batch_unique)[per_batch], ncol(dist_foo) )  )
})
dist_vec <- do.call(rbind, dist_vec)
str(dist_vec)
'data.frame':   578 obs. of  2 variables:
 $ dist        : num  0.403 0.343 0.357 0.329 0.616 ...
 $ batch_unique: Factor w/ 9 levels "19098_1","19098_2",..: 1 1 1 1 1 1 1 1 1 1 ...
ggplot(dist_vec, aes(x= factor(batch_unique), y = dist, fill = factor(batch_unique)) ) +
  geom_violin(alpha = .5) + geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance within batch \n
                in (intercept, slope)") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

## Within individual
individual <- anno_regression$individual
dist_vec <- lapply(1:length(unique(individual)), function(per_individual) {
  dist_foo <- anno_regression_dist[individual == unique(individual)[per_individual],
                             individual == unique(individual)[per_individual] ]
  data.frame(dist = c(rowSums(dist_foo)/(ncol(dist_foo) - 1)),
             individual = rep(unique(individual)[per_individual], ncol(dist_foo) )  )
})
dist_vec <- do.call(rbind, dist_vec)
str(dist_vec)
'data.frame':   578 obs. of  2 variables:
 $ dist      : num  0.482 0.431 0.442 0.415 0.661 ...
 $ individual: int  19098 19098 19098 19098 19098 19098 19098 19098 19098 19098 ...
ggplot(dist_vec, aes(x= factor(individual), y = dist, fill = factor(individual)),
       height = 600, width = 2000) +
  geom_violin(alpha = .5) + geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance within individual \n
                in (intercept, slope)") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

Only ERCC genes

## grep ERCC
reads_ERCC <- reads[grep("ERCC", rownames(reads)), ]
molecules_ERCC <- molecules[grep("ERCC", rownames(molecules)), ]

## linear regression per cell and make a table with intercept, slope, and r-squared
regression_table_ERCC <- as.data.frame(do.call(rbind,lapply(names(reads_ERCC),function(x){
  fit.temp <- lm(molecules_ERCC[,x]~reads_ERCC[,x])
    c(x,fit.temp$coefficients,summary(fit.temp)$adj.r.squared)
})))
names(regression_table_ERCC) <- c("sample_id","Intercept","slope","r2")
regression_table_ERCC$Intercept <- as.numeric(as.character(regression_table_ERCC$Intercept))
regression_table_ERCC$slope <- as.numeric(as.character(regression_table_ERCC$slope))
regression_table_ERCC$r2 <- as.numeric(as.character(regression_table_ERCC$r2))
plot(regression_table_ERCC$r2)

anno_regression_ERCC <- merge(anno,regression_table_ERCC,by="sample_id")
ggplot(anno_regression_ERCC,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion ERCC genes (counts)") + facet_grid(individual ~ batch)

## plot all the lines
anno_regression_ERCC$reads_mean <- apply(reads_ERCC, 2, mean)
anno_regression_ERCC$molecules_mean <- apply(molecules_ERCC, 2, mean)

ggplot(anno_regression_ERCC, aes(x= reads_mean, y= molecules_mean)) + geom_point() + geom_abline(aes(intercept=Intercept, slope=slope, col=as.factor(individual), alpha= 0.5), data=anno_regression) + facet_grid(individual ~ batch)

Pairwise distance in (intercept, slope) between batches or individuals.

  • Compute pairwise Euclidean distance between cells.
anno_regression_ERCC_dist <- as.matrix( dist(anno_regression_ERCC[ , c("Intercept", "slope")]) )
rownames(anno_regression_ERCC_dist) <- with(anno_regression_ERCC,
                                            paste(individual, batch, sep = "_"))
colnames(anno_regression_ERCC_dist) <- rownames(anno_regression_ERCC_dist)
anno_regression_ERCC_dist[1:2, 1:2]
          19098_1   19098_1
19098_1 0.0000000 0.6214327
19098_1 0.6214327 0.0000000
  • Cell-to-cell Euclidean distance within each individiaual, batch
## All cells
dist_mean <- data.frame(dist = rowSums(anno_regression_ERCC_dist)/(ncol(anno_regression_ERCC_dist) - 1))
ggplot(dist_mean, aes(y = dist, x = 1)) + geom_violin(alpha = .5) + 
  geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance in (intercept, slope)", x = "") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

## Within batch 
batch_unique <- with(anno_regression, paste(individual, batch, sep = "_"))
dist_vec <- lapply(1:length(unique(batch_unique)), function(per_batch) {
  dist_foo <- anno_regression_ERCC_dist[batch_unique == unique(batch_unique)[per_batch],
                             batch_unique == unique(batch_unique)[per_batch] ]
  data.frame(dist = c(rowSums(dist_foo)/(ncol(dist_foo) - 1)),
             batch_unique = rep(unique(batch_unique)[per_batch], ncol(dist_foo) )  )
})
dist_vec <- do.call(rbind, dist_vec)
str(dist_vec)
'data.frame':   578 obs. of  2 variables:
 $ dist        : num  0.442 0.606 0.608 0.436 0.441 ...
 $ batch_unique: Factor w/ 9 levels "19098_1","19098_2",..: 1 1 1 1 1 1 1 1 1 1 ...
ggplot(dist_vec, aes(x= factor(batch_unique), y = dist, fill = factor(batch_unique)),
       height = 600, width = 2000) +
  geom_violin(alpha = .5) + geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance within batch \n
                in (intercept, slope)") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

## Within individual
individual <- anno_regression_ERCC$individual
dist_vec <- lapply(1:length(unique(individual)), function(per_individual) {
  dist_foo <- anno_regression_ERCC_dist[individual == unique(individual)[per_individual],
                             individual == unique(individual)[per_individual] ]
  data.frame(dist = c(rowSums(dist_foo)/(ncol(dist_foo) - 1)),
             individual = rep(unique(individual)[per_individual], ncol(dist_foo) )  ) 
})
dist_vec <- do.call(rbind, dist_vec)
str(dist_vec)
'data.frame':   578 obs. of  2 variables:
 $ dist      : num  0.514 0.729 0.732 0.515 0.535 ...
 $ individual: int  19098 19098 19098 19098 19098 19098 19098 19098 19098 19098 ...
ggplot(dist_vec, aes(x= factor(individual), y = dist, fill = factor(individual)),
       height = 600, width = 2000) +
  geom_violin(alpha = .5) + geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance within individual \n
                in (intercept, slope)") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

Without ERCC genes, only endogenous genes

## grep ENSG
reads_ENSG <- reads[grep("ENSG", rownames(reads)), ]
molecules_ENSG <- molecules[grep("ENSG", rownames(molecules)), ]

## linear regression per cell and make a table with intercept, slope, and r-squared
regression_table_ENSG <- as.data.frame(do.call(rbind,lapply(names(reads_ENSG),function(x){
  fit.temp <- lm(molecules_ENSG[,x]~reads_ENSG[,x])
    c(x,fit.temp$coefficients,summary(fit.temp)$adj.r.squared)
})))
names(regression_table_ENSG) <- c("sample_id","Intercept","slope","r2")
regression_table_ENSG$Intercept <- as.numeric(as.character(regression_table_ENSG$Intercept))
regression_table_ENSG$slope <- as.numeric(as.character(regression_table_ENSG$slope))
regression_table_ENSG$r2 <- as.numeric(as.character(regression_table_ENSG$r2))
plot(regression_table_ENSG$r2)

anno_regression_ENSG <- merge(anno,regression_table_ENSG,by="sample_id")

ggplot(anno_regression_ENSG,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "lm of read-molecule conversion endogenous genes in each cell (counts)")

ggplot(anno_regression_ENSG,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "lm of read-molecule conversion endogenous genes in each cell (counts)") + facet_grid(individual ~ batch)

## plot all the lines
anno_regression_ENSG$reads_mean <- apply(reads_ENSG, 2, mean)
anno_regression_ENSG$molecules_mean <- apply(molecules_ENSG, 2, mean)

ggplot(anno_regression_ENSG, aes(x= reads_mean, y= molecules_mean)) + geom_point() + geom_abline(aes(intercept=Intercept, slope=slope, col=as.factor(individual), alpha= 0.5), data=anno_regression_ENSG)

ggplot(anno_regression_ENSG, aes(x= reads_mean, y= molecules_mean)) + geom_point() + geom_abline(aes(intercept=Intercept, slope=slope, col=as.factor(individual), alpha= 0.5), data=anno_regression_ENSG) + facet_grid(individual ~ batch)

Pairwise distance in (intercept, slope) between batches or individuals.

  • Compute pairwise Euclidean distance between cells.
anno_regression_ENSG_dist <- as.matrix( dist(anno_regression_ENSG[ , c("Intercept", "slope")]) )
rownames(anno_regression_ENSG_dist) <- with(anno_regression_ENSG,
                                            paste(individual, batch, sep = "_"))
colnames(anno_regression_ENSG_dist) <- rownames(anno_regression_ENSG_dist)
anno_regression_ENSG_dist[1:2, 1:2]
          19098_1   19098_1
19098_1 0.0000000 0.4931873
19098_1 0.4931873 0.0000000
  • Cell-to-cell Euclidean distance within each individiaual, batch
## All cells
dist_mean <- data.frame(dist = rowSums(anno_regression_ENSG_dist)/(ncol(anno_regression_ENSG_dist) - 1))
ggplot(dist_mean, aes(y = dist, x = 1)) + geom_violin(alpha = .5) + 
  geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance in (intercept, slope)", x = "") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

## Within batch 
batch_unique <- with(anno_regression_ENSG, paste(individual, batch, sep = "_"))
dist_vec <- lapply(1:length(unique(batch_unique)), function(per_batch) {
  dist_foo <- anno_regression_ENSG_dist[batch_unique == unique(batch_unique)[per_batch],
                             batch_unique == unique(batch_unique)[per_batch] ]
  data.frame(dist = c(rowSums(dist_foo)/(ncol(dist_foo) - 1)),
             batch_unique = rep(unique(batch_unique)[per_batch], ncol(dist_foo) )  )
})
dist_vec <- do.call(rbind, dist_vec)
str(dist_vec)
'data.frame':   578 obs. of  2 variables:
 $ dist        : num  0.404 0.344 0.363 0.332 0.629 ...
 $ batch_unique: Factor w/ 9 levels "19098_1","19098_2",..: 1 1 1 1 1 1 1 1 1 1 ...
ggplot(dist_vec, aes(x= factor(batch_unique), y = dist, fill = factor(batch_unique)),
       height = 600, width = 2000) +
  geom_violin(alpha = .5) + geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance within batch \n
                in (intercept, slope)") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

## Within individual
individual <- anno_regression_ENSG$individual
dist_vec <- lapply(1:length(unique(individual)), function(per_individual) {
  dist_foo <- anno_regression_ENSG_dist[individual == unique(individual)[per_individual],
                             individual == unique(individual)[per_individual] ]
  data.frame(dist = c(rowSums(dist_foo)/(ncol(dist_foo) - 1)),
             individual = rep(unique(individual)[per_individual], ncol(dist_foo) )  )
})
dist_vec <- do.call(rbind, dist_vec)
str(dist_vec)
'data.frame':   578 obs. of  2 variables:
 $ dist      : num  0.457 0.429 0.446 0.404 0.699 ...
 $ individual: int  19098 19098 19098 19098 19098 19098 19098 19098 19098 19098 ...
ggplot(dist_vec, aes(x= factor(individual), y = dist, fill = factor(individual)),
       height = 600, width = 2000) +
  geom_violin(alpha = .5) + geom_boxplot(alpha = .01, width = .2, position = position_dodge(width = .9)) +
  labs(title = "Mean cell-to-cell distance within individual \n
                in (intercept, slope)") +
  theme(axis.text.x = element_text(hjust=1, angle = 45))

There is a clear difference between individuals. How about difference between batches? Use 9 different colors to visualize the 9 batches

ggplot(anno_regression_ENSG, aes(x = Intercept, y = slope, col = paste(individual, batch, sep = "."))) +
  geom_point() +
  scale_color_brewer(palette = "Set1", name = "9 batches") +
  labs(title = "lm of read-molecule conversion endogenous genes in each cell (counts)")

Here are the batches split by individual.

by_batch <- ggplot(anno_regression_ENSG, aes(x = Intercept, y = slope)) +
            geom_point(aes(color = as.factor(batch))) +
            facet_wrap(~individual, nrow = 3) +
            scale_color_brewer(palette = "Dark2", name = "batch") +
            labs(title = "lm of read-molecule conversion endogenous genes in each cell (counts)")
by_batch

by_batch + geom_smooth(aes(group=batch, color = as.factor(batch)), method="lm") 

19098 have more total endogenous gene molecules and more ERCC molecule

anno_regression$total_molecule_ERCC <- apply(molecules_ERCC, 2, sum)
anno_regression$total_molecule_ENSG <- apply(molecules_ENSG, 2, sum)
anno_regression$total_molecule <- apply(molecules, 2, sum) 
ggplot(anno_regression, aes(x= total_molecule_ERCC, y= total_molecule,col=as.factor(individual),shape=as.factor(batch))) + geom_point()

ggplot(anno_regression, aes(x = as.factor(individual), y = total_molecule_ENSG)) + geom_boxplot(aes(fill = as.factor(batch)))

ggplot(anno_regression, aes(x = as.factor(individual), y = total_molecule_ERCC)) + geom_boxplot(aes(fill = as.factor(batch)))

Log2 counts

Compare the log counts. The r squared values of lm are relativiely low (around 0.85).

regression_table_log <- as.data.frame(do.call(rbind,lapply(names(reads_log),function(x){
  fit.temp <- lm(molecules_log[,x]~reads_log[,x])
    c(x,fit.temp$coefficients,summary(fit.temp)$adj.r.squared)
})))
names(regression_table_log) <- c("sample_id","Intercept","slope","r2")
regression_table_log$Intercept <- as.numeric(as.character(regression_table_log$Intercept))
regression_table_log$slope <- as.numeric(as.character(regression_table_log$slope))
regression_table_log$r2 <- as.numeric(as.character(regression_table_log$r2))
plot(regression_table_log$r2)

anno_regression_log <- merge(anno,regression_table_log,by="sample_id")
ggplot(anno_regression_log,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion (log2)")

counts per million

Compare the counts per million.

regression_table_cpm <- as.data.frame(do.call(rbind,lapply(colnames(reads_cpm),function(x){
  fit.temp <- lm(molecules_cpm[,x]~reads_cpm[,x])
    c(x,fit.temp$coefficients,summary(fit.temp)$adj.r.squared)
})))
names(regression_table_cpm) <- c("sample_id","Intercept","slope","r2")
regression_table_cpm$Intercept <- as.numeric(as.character(regression_table_cpm$Intercept))
regression_table_cpm$slope <- as.numeric(as.character(regression_table_cpm$slope))
regression_table_cpm$r2 <- as.numeric(as.character(regression_table_cpm$r2))
plot(regression_table_cpm$r2)

anno_regression_cpm <- merge(anno,regression_table_cpm,by="sample_id")

ggplot(anno_regression_cpm,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion (cpm no log)")

ggplot(anno_regression_cpm,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion (cpm no log)") + facet_grid(individual ~ batch) 

## plot all the lines
anno_regression_cpm$reads_mean <- apply(reads_cpm, 2, mean)
anno_regression_cpm$molecules_mean <- apply(molecules_cpm, 2, mean)

ggplot(anno_regression_cpm, aes(x= reads_mean, y= molecules_mean)) + geom_point() + geom_abline(aes(intercept=Intercept, slope=slope, col=as.factor(individual), alpha = 0.01), data=anno_regression_cpm) + facet_grid(individual ~ batch) 

## look at just the slope using total raw molecule counts (total cpm molecule counts are all the same across cells)
anno_regression_cpm$total_molecule <- apply(molecules, 2, sum)
ggplot(anno_regression_cpm, aes(x= total_molecule, y= slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point()

Only ERCC genes

## grep ERCC
reads_cpm_ERCC <- as.data.frame(reads_cpm[grep("ERCC", rownames(reads_cpm)), ])
molecules_cpm_ERCC <- as.data.frame(molecules_cpm[grep("ERCC", rownames(molecules_cpm)), ])

## linear regression per cell and make a table with intercept, slope, and r-squared
regression_table_cpm_ERCC <- as.data.frame(do.call(rbind,lapply(names(reads_cpm_ERCC),function(x){
  fit.temp <- lm(molecules_cpm_ERCC[,x]~reads_cpm_ERCC[,x])
    c(x,fit.temp$coefficients,summary(fit.temp)$adj.r.squared)
})))
names(regression_table_cpm_ERCC) <- c("sample_id","Intercept","slope","r2")
regression_table_cpm_ERCC$Intercept <- as.numeric(as.character(regression_table_cpm_ERCC$Intercept))
regression_table_cpm_ERCC$slope <- as.numeric(as.character(regression_table_cpm_ERCC$slope))
regression_table_cpm_ERCC$r2 <- as.numeric(as.character(regression_table_cpm_ERCC$r2))
plot(regression_table_cpm_ERCC$r2)

anno_regression_cpm_ERCC <- merge(anno,regression_table_cpm_ERCC,by="sample_id")

ggplot(anno_regression_cpm_ERCC,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion ERCC genes (cpm no log)") 

## plot all the lines
anno_regression_cpm_ERCC$reads_mean <- apply(reads_cpm_ERCC, 2, mean)
anno_regression_cpm_ERCC$molecules_mean <- apply(molecules_cpm_ERCC, 2, mean)

ggplot(anno_regression_cpm_ERCC, aes(x= reads_mean, y= molecules_mean)) + geom_point() + geom_abline(aes(intercept=Intercept, slope=slope, col=as.factor(individual), alpha= 0.5), data=anno_regression_cpm_ERCC) + facet_grid(individual ~ batch)

TMM-normalized counts per million

Compare TMM-normalized log2-transformed cpm .

regression_table_tmm <- as.data.frame(do.call(rbind,lapply(colnames(reads_tmm),function(x){
  fit.temp <- lm(molecules_tmm[,x]~reads_tmm[,x])
    c(x,fit.temp$coefficients,summary(fit.temp)$adj.r.squared)
})))
names(regression_table_tmm) <- c("sample_id","Intercept","slope","r2")
regression_table_tmm$Intercept <- as.numeric(as.character(regression_table_tmm$Intercept))
regression_table_tmm$slope <- as.numeric(as.character(regression_table_tmm$slope))
regression_table_tmm$r2 <- as.numeric(as.character(regression_table_tmm$r2))
plot(regression_table_tmm$r2)

anno_regression_tmm <- merge(anno,regression_table_tmm,by="sample_id")

ggplot(anno_regression_tmm,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion (tmm cpm log)")

ggplot(anno_regression_tmm,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion (tmm cpm log)") + facet_grid(individual ~ batch) 

## plot all the lines
anno_regression_tmm$reads_mean <- apply(reads_tmm, 2, mean)
anno_regression_tmm$molecules_mean <- apply(molecules_tmm, 2, mean)

ggplot(anno_regression_tmm, aes(x= reads_mean, y= molecules_mean)) + geom_point() + geom_abline(aes(intercept=Intercept, slope=slope, col=as.factor(individual), alpha = 0.01), data=anno_regression_tmm) + facet_grid(individual ~ batch) 

Session information

sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.4 (Yosemite)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] tidyr_0.3.1   edgeR_3.10.2  limma_3.24.15 ggplot2_1.0.1 dplyr_0.4.3  
[6] knitr_1.11   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.1        magrittr_1.5       MASS_7.3-44       
 [4] munsell_0.4.2      colorspace_1.2-6   R6_2.1.1          
 [7] stringr_1.0.0      plyr_1.8.3         tools_3.2.1       
[10] parallel_3.2.1     grid_3.2.1         gtable_0.1.2      
[13] DBI_0.3.1          htmltools_0.2.6    yaml_2.1.13       
[16] assertthat_0.1     digest_0.6.8       RColorBrewer_1.1-2
[19] reshape2_1.4.1     formatR_1.2.1      evaluate_0.8      
[22] rmarkdown_0.8      labeling_0.3       stringi_0.5-5     
[25] scales_0.3.0       proto_0.3-10